Table of contents :

In [1]:
# Import custom packages & modules
import sys
sys.path.append("..")
# Univariate analysis module
from src.analyzer.univariate import *
# Multivariate analysis module
from src.analyzer.multivariate import *
# Data cleaning module
from src.datacleaner import *
# Data preprocessing module
from src.preprocessor import *
# Model evaluation module
from src.evaluator import *
# Ensemble-based methods visualization module
from src.modelizer.ensemble.tree_interpreter import *
# Model selection
from sklearn.model_selection import train_test_split
# Linear models
from sklearn.linear_model import ElasticNet, ElasticNetCV
# Non-linear models
from sklearn.svm import SVR
from sklearn.neural_network import MLPRegressor
# Ensemble-based methods
import xgboost as xgb
from sklearn.ensemble import RandomForestRegressor
# Compute training time
import datetime
In [2]:
# Model evaluation wrappers

# Define project scorers (RMSE & R2)
third_project_scorers = ['neg_root_mean_squared_error', 'r2']

# Main training wrapper using GridSearchCV 
def train_gridsearch(data, model, param_grid, metric=third_project_scorers, k=10, p=3, v=True):
    # Model name
    model_label = model_name(model)
    # Get training & testing data
    x_train, y_train = data['train']
    x_test, y_test = data['test']
    # Define refit condition (first metric if evaluationg multiple metrics else False)
    refit_cond = metric[0] if type(metric) is list else False
    # Build grid search
    gridsearch = GridSearchCV(model, param_grid, cv=k, scoring=metric, refit=refit_cond)
    # Time the model training
    start_training = datetime.datetime.now()
    # Train model with grid search
    gridsearch.fit(x_train, y_train)
    end_training = datetime.datetime.now()
    # Compute training time
    training_time = end_training - start_training
    # Format training time
    training_time_str = format_run_time(training_time)
    # Trained_model
    trained_model = gridsearch.best_estimator_
    # Get scores from cross validation
    cv_scores = {}
    for scorer_label in (metric if type(metric) is list else [metric]):
        if scorer_label.startswith('neg'):
            formatted_label = "".join([w[0] for w in scorer_label.replace('neg_', '').split('_')])
            formatted_score = round(np.abs(gridsearch.cv_results_[f'mean_test_{scorer_label}'])[0], p)
            cv_scores[formatted_label] = formatted_score
        else:
            cv_scores[scorer_label] = round(gridsearch.cv_results_[f'mean_test_{scorer_label}'][0], p)
    # Get scores from testing set
    testing_set_scores = get_model_scores(trained_model, x_test, y_test, list(cv_scores.keys()), p, v)
    # Display cross validation mean scores
    if v:
        print_score_results(cv_scores, set_type='train')
    # Build model dictionary which contains GridSearchCV & model instances (with model name)
    model_data = {'gs': gridsearch,                                 # GridSearchCV trained instance
                  'model': trained_model,                           # Model trained instance
                  'model_name': model_label}                        # Model name
    # Build additional evaluation data dictionary
    additional_evaluation_data = {'time': training_time_str,        # Training time
                                  'n_features': x_train.shape[1],   # Selected features
                                  'learning_potential': None}       # Learning potential
    # Build results dictionary (merge dictionnaries)
    results = dict(**model_data, **testing_set_scores, **additional_evaluation_data)
    return results


def select_most_important_features(features_coefs_df,
                                   n=None,
                                   method='cumsum',
                                   model=None,
                                   thr='mean',
                                   q_value=0.5,
                                   thr_value=None,
                                   v=False):
    """
    Wrapper which select most important features from machine learning model, based on specified method
    (cumulative threshold or)
    """
    # Feature importance selection method
    if method is 'cumsum':                                # if thr is 'cumsum'
        n_mif_data = features_coefs_df.iloc[:n]
        n_mif_labels = n_mif_data['feature'].tolist()
    elif method is 'threshold':                           # else
        # Threshold method
        if thr is 'q': # quantile
            mif_thr = features_coefs_df['coefficient'].quantile(q=q_value)
        elif thr is 'mean': # mean
            mif_thr = features_coefs_df['coefficient'].mean()
        else: # arbitrary number
            mif_thr = thr_value
        # N.B : function from mlearn preprocessor module
        n_mif_data = filter_features_by_threshold(X,
                                                  X_train_std,
                                                  X_test_std,
                                                  model,
                                                  mif_thr,
                                                  verbose=v)
        n_mif_labels = n_mif_data['labels']
    # Return n selected features dataframe & labels
    return n_mif_data, n_mif_labels


def run_training_cycle(features_reduced,
                       model_param_grid,
                       model=None,
                       target='energy',
                       training_wrapper='gs',
                       k=5,
                       s=third_project_scorers,
                       p=3,
                       v=True):
    """
    Wrapper which run a training cycle based on reduced features
    """
    # Filter training and testing features data (reduce the number of features)
    X_train_std_reduced = X_train_std_df.loc[:, features_reduced]
    X_test_std_reduced = X_test_std_df.loc[:, features_reduced]
    # Define target type
    if target is 'energy':
        y_train_target, y_test_target = y_train_energy, y_test_energy
    elif target is 'emissions':
        y_train_target, y_test_target = y_train_emissions, y_test_emissions
    # Build training & testing data dictionary for each target   
    training_and_testing_data_reduced = {"train": [X_train_std_reduced, y_train_target],
                                         "test": [X_test_std_reduced, y_test_target]}
    # Define training wrapper type (elastic net cross val or gridsearchCV)
    if training_wrapper == 'en':
        results = train_elastic_net(training_and_testing_data_reduced,
                                    model_param_grid)
    elif training_wrapper == 'gs':
        results = train_gridsearch(training_and_testing_data_reduced,
                                   model,
                                   model_param_grid,
                                   metric=s,
                                   k=k,       
                                   p=p,
                                   v=v)
    return results, training_and_testing_data_reduced


# Elastic Net Wrapper

def train_elastic_net(data, params_grid, k=10, v=True, s=['rmse', 'r2']):
    # Get training and testing sets from data dictionary
    X_train_std, y_train = data["train"]
    X_test_std, y_test = data["test"]
    # Get (hyper)parameters from parameters grid
    alpha_range, l1_ratio_range = params_grid.values()
    # Build model using cross validation 
    elastic_net = ElasticNetCV(alphas=alpha_range,
                               cv=k,
                               l1_ratio=l1_ratio_range)
    start_training = datetime.datetime.now()
    # Train model with cross validation
    elastic_net.fit(X_train_std, y_train)
    end_training = datetime.datetime.now()
    # Compute training time
    training_time = end_training - start_training
    # Format training time
    training_time_str = format_run_time(training_time)
    # Optimal parameters
    optimal_alpha = elastic_net.alpha_
    optimal_l1_ratio = elastic_net.l1_ratio_
    if v:
        print("Alpha : {} | l1_ratio : {}\n".format(optimal_alpha,
                                                    optimal_l1_ratio))
    # Regularization path
    alphas, coefs, _ = elastic_net.path(X_train_std,
                                        y_train,
                                        alphas=alpha_range,
                                        l1_ratio=optimal_l1_ratio)
    # Format scorer label (example : neg_root_mean_squared_error --> rmse)
    formatted_s = []
    for scorer_label in s:
        if scorer_label.startswith('neg'):
            scorer_label = "".join([w[0] for w in scorer_label.replace('neg_', '').split('_')])
        formatted_s.append(scorer_label)
    # Model evaluation
    testing_set_scores = get_model_scores(elastic_net,
                                          X_test_std,
                                          y_test,
                                          scorer=formatted_s,
                                          verbose=v)
    # Build model dictionary which contains model data (instance & name)
    model_data = {'model': elastic_net,
                  'model_name': model_name(elastic_net)}
    # Build additional evaluation data dictionary
    additional_evaluation_data = {'time': training_time_str,            # Training time
                                  'n_features': X_train_std.shape[1],   # Selected features
                                  'learning_potential': None,           # Learning potential
                                  'best_params': [optimal_alpha, optimal_l1_ratio],
                                  'reg_path_data': [alphas, coefs]}            
    # Build results dictionary (merge dictionnaries) 
    results = dict(**model_data, **testing_set_scores, **additional_evaluation_data)
    return results
In [3]:
# Import du dataset
df_raw = pd.read_csv('../data/csv/seattle_model_data_ENERGYSTARScore.csv')
df = df_raw.copy()
print(df.shape)
df.head(2)
(4692, 163)
Out[3]:
PropertyGFATotal LargestPropertyUseTypeGFA SecondLargestPropertyUseTypeGFA ThirdLargestPropertyUseTypeGFA NumberofFloors ENERGYSTARScore SiteEnergyUse(kBtu) TotalGHGEmissions Main_energy_electricity Main_energy_gaz ... Neighborhood_Downtown Neighborhood_East Neighborhood_Greater duwamish Neighborhood_Lake union Neighborhood_Magnolia / queen anne Neighborhood_North Neighborhood_Northeast Neighborhood_Northwest Neighborhood_Southeast Neighborhood_Southwest
0 12.192825 11.884123 10.680999 9.093807 6.0 28.0 15.930252 5.166499 1 0 ... 0 0 0 0 0 0 0 0 0 0
1 13.012001 12.552845 11.705857 10.682262 8.0 67.0 16.706546 5.631964 1 0 ... 0 0 0 0 0 0 0 0 0 0

2 rows × 163 columns

In [4]:
# Targets
targets_cols = ['SiteEnergyUse(kBtu)', 'TotalGHGEmissions']

# Features
features_cols = [col for col in df.columns if col not in targets_cols]

# Features data
df_without_targets = df[features_cols]

I - Preprocessing

1 - Feature Selection

1.1 - Filter features with identical or null variance

In [5]:
# N.B : functions from mlearn prepocessor module 

# Method :
# null_variance_cols = features_with_null_variances(df, verbose=True)
# identical_variance_cols = features_with_identical_variances(df, col_kept='last', verbose=True)
# invalid_variances_cols = null_variance_cols + identical_variance_cols
# df = df[[col for col in df.columns if col not in invalid_variances_cols]]

# Remove invalid features from dataframe
df = filter_invalid_variances(df, feature_kept='last')
0/163 features with null variance, reduction of 0.0%
9/163 features with identical variance, reduction of 5.5%

1.2 - Filter correlated features

In [6]:
# N.B : functions from mlearn prepocessor module 

# Get filtered features dataframe
features = filter_correlated_features(df_without_targets, threshold=0.5, verbose=True)
# features = df_without_targets 
33/161 features correlated, reduction of 20.5%
In [7]:
features.shape
Out[7]:
(4692, 128)
In [ ]:
# Remove energy variables
# delete_cols(features, ['Main_energy_electricity', 'Main_energy_steam'])

2 - Split training set & testing set

In [8]:
# Features data
X = features

# Extract features labels
training_features = X.columns.tolist()

# Targets data
y = df[targets_cols]

# Training & testing sets split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Split targets (energy consumption & gaz emissions features)
y_train_energy, y_train_emissions = [y_train[target] for target in targets_cols]
y_test_energy, y_test_emissions = [y_test[target] for target in targets_cols]

3 - Feature Scaling (standardize data)

In [9]:
# Standardize data (center & reduce)
# N.B : function from mlearn preprocessor module

X_train_std, std_scaler = standard_scaler(X_train, return_std_scaler=True)
X_test_std = std_scaler.transform(X_test)

# Build training & testing data dictionary for each target
energy_data = {"train": [X_train_std, y_train_energy],
               "test": [X_test_std, y_test_energy]}

emissions_data = {"train": [X_train_std, y_train_emissions],
                  "test": [X_test_std, y_test_emissions]}
In [10]:
# Build dataframes from standardized training & testing data
# => permet de faciliter le filtrage des features qui contribuent le plus au model
X_train_std_df = pd.DataFrame(X_train_std, columns=training_features)
X_test_std_df = pd.DataFrame(X_test_std, columns=training_features)

II - Dummies regressions :

1 - Dummy regression ( target : 'SiteEnergyUse(kBtu)' )

In [11]:
# Build dummy regression with mean as strategy
y_pred_dum_energy, y_test_dum_energy = dummy_regression(X_train_std,
                                                        y_train_energy,
                                                        X_test_std,
                                                        y_test_energy,
                                                        strategy='mean')

print('Dummy regression : RMSE = {}'.format(mean_squared_error(y_test_energy,
                                                               y_pred_dum_energy,
                                                               squared=False)))

print('Dummy regression : R2 = {}'.format(r2_score(y_test_energy, y_pred_dum_energy)))
Dummy regression : RMSE = 1.1410705875584375
Dummy regression : R2 = -0.001408066617396786

2 - Dummy regression ( target : 'TotalGHGEmissions' )

In [12]:
y_pred_dum_emissions, y_test_dum_emissions = dummy_regression(X_train_std,
                                                               y_train_emissions,
                                                               X_test_std,
                                                               y_test_emissions,
                                                               strategy='mean')

print('Dummy regression : RMSE = {}'.format(mean_squared_error(y_test_emissions,
                                                               y_pred_dum_emissions,
                                                               squared=False)))

print('Dummy regression : R2 = {}'.format(r2_score(y_test_emissions, y_pred_dum_emissions)))
Dummy regression : RMSE = 1.5205513788212472
Dummy regression : R2 = -0.0019039504651874317

III - Modelize :

1 - Linear model : ElasticNet

1.1 - ElasticNet (target : 'SiteEnergyUse(kBtu)' )

1.1.1 - Train model (optimize hyperparameters)

In [203]:
# alphas range : np.logspace(-3, 3, 10), np.arange(0.001, 0.009, 0.001)
en_params_grid = {"alphas": np.logspace(-3, 3, 10),
                  "l1_ratio": [0.2, 0.4, 0.7, 0.75, 0.8, 0.95, 0.99]}

# Train Elastic Net with cross validation (result is a dictionary)
enet_en_data = train_elastic_net(energy_data, en_params_grid, k=10)
Alpha : 0.001 | l1_ratio : 0.99

ElasticNetCV
--------------------------
Testing set performances :
- RMSE = 0.361
- R2 = 0.9

1.1.2 - Regularization path

In [14]:
# Get hyperparameter & coefficient ranges
en_alphas, en_coefs = enet_en_data["reg_path_data"]

# N.B : function from mlearn evaluator module
plot_regularization_path(en_alphas, en_coefs.T, training_features, n_features_labels=10)

1.1.3 - Select most important features (method : cumulative feature importance selection)

In [15]:
# N.B : Elastic Net reduces the coefficients of irrelevant features to 0. 
# We have therefore selected here features with positive or negative coefficients

elastic_net_en_coefs = enet_en_data["model"].coef_

# Extract feature importance filtered coefficients dataframes
elastic_net_en_features_coefs_df = get_features_importance(training_features,
                                                           elastic_net_en_coefs,
                                                           abs_coefs=True,
                                                           non_zero_coefs=True,
                                                           verbose=True)
# N.B : function from mlearn preprocessor module
plot_cumulative_features_importance(elastic_net_en_features_coefs_df, threshold=0.90, plot_size=(12, 6))
113/128 features selected, reduction of 11.7%
In [16]:
# Select the 43 most important features
enet_en_mif_43_data = elastic_net_en_features_coefs_df.iloc[:43]
enet_en_mif_43_labels = enet_en_mif_43_data['feature'].tolist()
In [17]:
# Visualize n most important features
# N.B : function from mlearn preprocessor module
plot_n_top_features(enet_en_mif_43_data,
                    model_name(enet_en_data["model"]),
                    n=10,
                    plot_size=(12, 4))

1.1.4 - Second training cycle (with cumulative feature importance selection method)

In [18]:
# Second training cycle with features reduced (173 -> 43)

X_train_std_en_43f = X_train_std_df.loc[:, enet_en_mif_43_labels]
X_test_std_en_43f = X_test_std_df.loc[:, enet_en_mif_43_labels]


# Build training & testing data dictionary for each target
en_data_reduced_43f = {"train": [X_train_std_en_43f, y_train_energy],
                       "test": [X_test_std_en_43f, y_test_energy]}


enet_en_reduced_data_43f = train_elastic_net(en_data_reduced_43f, en_params_grid)
Alpha : 0.001 | l1_ratio : 0.2

ElasticNetCV
--------------------------
Testing set performances :
- RMSE = 0.36
- R2 = 0.9

1.1.5 - Select most important features (method : threshold feature importance selection)

In [19]:
# We use the same model (from first training cycle)
mif_q3 = elastic_net_en_features_coefs_df['coefficient'].quantile(q=0.75)
ffd = filter_features_by_threshold(X, X_train_std, X_test_std, enet_en_data['model'], mif_q3)
print('Features selected : {}'.format(len(ffd['labels'])))
enet_en_mif_29_labels = ffd['labels']
Features selected : 29

1.1.6 - Second training cycle (with threshold feature selection method)

In [204]:
# Second training cycle with features reduced (173 -> 29)

X_train_std_en_29f = X_train_std_df.loc[:, enet_en_mif_29_labels]
X_test_std_en_29f = X_test_std_df.loc[:, enet_en_mif_29_labels]


# Build training & testing data dictionary for each target
en_data_reduced_29f = {"train": [X_train_std_en_29f, y_train_energy],
                       "test": [X_test_std_en_29f, y_test_energy]}


enet_en_reduced_data_29f = train_elastic_net(en_data_reduced_29f, en_params_grid)
# Cumulative feature importance selection method seems to be slightly better
# (fewer variables selected and a slightly better r2)
Alpha : 0.001 | l1_ratio : 0.2

ElasticNetCV
--------------------------
Testing set performances :
- RMSE = 0.361
- R2 = 0.9

1.1.7 - Training curve

In [21]:
# Plot training curve

# N.B : function from mlearn evaluator module
plot_validation_curve(ElasticNet(alpha=0.001, l1_ratio=0.2),
                      X_train_std_en_29f,
                      y_train_energy,
                      'alpha',
                      np.logspace(-3, 3, 10),
                      log_scale=True,
                      scorer='neg_root_mean_squared_error')

1.1.8 - Learning curve

In [22]:
# Plot learning curve

# N.B : function from mlearn evaluator module
plot_learning_curve(ElasticNet(alpha=0.001, l1_ratio=0.2, max_iter=2000),
                    X_train_std_en_29f,
                    y_train_energy,
                    train_sizes_ratio=np.linspace(0.1, 1, 10),
                    scorer='neg_root_mean_squared_error')

# Potential for improvement : No
# Model performance seems to stagnate from around 2 000 observations
In [213]:
enet_en_reduced_data_29f['learning_potential'] = 'No'

1.2 - ElasticNet (target : 'TotalGHGEmissions' )

N.B : From here we will use wrappers functions in order to :

  • select the most important features
  • repeat training cycles

1.2.1 - Train model (optimize hyperparameters)

In [24]:
# Train model
em_params_grid = {"alphas": np.logspace(-3, 3, 10),
                  "l1_ratio": [0.2, 0.4, 0.7, 0.75, 0.8, 0.95, 0.99]}

enet_em_data = train_elastic_net(emissions_data, em_params_grid, k=10)
Alpha : 0.004641588833612777 | l1_ratio : 0.7

ElasticNetCV
--------------------------
Testing set performances :
- RMSE = 0.711
- R2 = 0.781

1.2.2 - Regularization path

In [25]:
em_alphas, em_coefs = enet_em_data["reg_path_data"]

plot_regularization_path(em_alphas, em_coefs.T, training_features, n_features_labels=10)

1.2.3 - Select most important features (method : cumulative feature importance selection)

In [26]:
elastic_net_em_coefs = enet_em_data["model"].coef_

elastic_net_em_features_coefs_df = get_features_importance(training_features,
                                                           elastic_net_em_coefs,
                                                           abs_coefs=True,
                                                           non_zero_coefs=True,
                                                           verbose=True)

plot_cumulative_features_importance(elastic_net_em_features_coefs_df, threshold=0.90, plot_size=(12, 6))
112/128 features selected, reduction of 12.5%
In [27]:
enet_em_mif_51_data, enet_em_mif_51_labels = select_most_important_features(elastic_net_em_features_coefs_df,
                                                                            n=51,
                                                                            method='cumsum')
# Visualize n most important features
plot_n_top_features(enet_em_mif_51_data,
                    model_name(enet_em_data["model"]),
                    n=10,
                    plot_size=(12, 4))

1.2.4 - Second training cycle (with cumulative feature importance selection method)

In [28]:
# Second training cycle with features reduced (173 -> 69)

# em_params_grid = {"alphas": np.arange(0.0020, 0.0030, 0.0001),
#                   "l1_ratio": [0.2, 0.4, 0.7, 0.75, 0.8, 0.95, 0.99]}

enet_em_reduced_data_51f, train_and_test_data_reduced_51f  = run_training_cycle(enet_em_mif_51_labels,
                                                                                em_params_grid,
                                                                                training_wrapper='en') 
Alpha : 0.001 | l1_ratio : 0.4

ElasticNetCV
--------------------------
Testing set performances :
- RMSE = 0.363
- R2 = 0.899

1.2.5 - Select most important features (method : threshold feature importance selection)

In [29]:
enet_em_mif_n_data, enet_em_mif_n_labels = select_most_important_features(elastic_net_em_features_coefs_df,
                                                                          method='threshold',
                                                                          model=enet_em_data["model"],
                                                                          thr='q',
                                                                          q_value=0.75, # Q3
                                                                          v=True)
# # Visualize n most important features
# plot_n_top_features(mif_n_data['data'],
#                     model_name(enet_emissions_data["model"]),
#                     n=10,
#                     plot_size=(12, 4))
28/128 selected features, reduction of 21.9%

1.2.6 - Second training cycle (with threshold feature selection method)

In [30]:
# Second training cycle with features reduced (173 -> 28)

enet_em_reduced_data_28f, train_and_test_data_reduced_28f  = run_training_cycle(enet_em_mif_n_labels,
                                                                                em_params_grid,
                                                                                training_wrapper='en') 
Alpha : 0.001 | l1_ratio : 0.2

ElasticNetCV
--------------------------
Testing set performances :
- RMSE = 0.364
- R2 = 0.898

1.2.7 - Training curve

In [31]:
# Plot training curve

x_train_std_em_28f = train_and_test_data_reduced_28f['train'][0]

plot_validation_curve(ElasticNet(alpha=0.001, l1_ratio=0.2),
                      x_train_std_em_28f,
                      y_train_emissions,
                      'alpha',
                      np.logspace(-3, 3, 10),
                      log_scale=True,
                      scorer='neg_root_mean_squared_error')

1.2.8 - Learning curve

In [32]:
# Plot learning curve

plot_learning_curve(ElasticNet(alpha=0.001, l1_ratio=0.2, max_iter=2000),
                    x_train_std_em_28f,
                    y_train_emissions,
                    train_sizes_ratio=np.linspace(0.1, 1, 10),
                    scorer='neg_root_mean_squared_error')

# Potential for improvement : No
# The curve stagnates around 2500 observations
In [214]:
enet_em_reduced_data_28f['learning_potential'] = 'No'

2 - Ensemble methods : Random forest, XGBoost

1.1 - Random Forest (target : 'SiteEnergyUse(kBtu)' )

1.1.1 - Train model (optimize hyperparameters)

In [34]:
%%time 

# Random Forest

rfr_model = RandomForestRegressor(random_state=42)

#    n_estimators range      |       max_depth range
#  np.arange(200, 550, 50)   |      [2, 4, 6, 8, 10]
# [400, 500, 600]

# Optimal hyperparameters
rfr_en_params = {'n_estimators': [100], # 100
#                'max_depth': [40],
#                'max_features': ['auto', 'sqrt', 'log2']
                 'min_samples_leaf': [1]}

rfr_en_data = train_gridsearch(energy_data,
                               rfr_model,
                               rfr_en_params,
                               k=10)

# print(rfr_en_data['model'])
"""
- RMSE = 0.333
- R2 = 0.915
"""
RandomForestRegressor
--------------------------
Testing set performances :
- RMSE = 0.334
- R2 = 0.914
--------------------------------
Training set performances (CV) :
- RMSE = 0.332
- R2 = 0.905
CPU times: user 29.3 s, sys: 94.8 ms, total: 29.4 s
Wall time: 29.5 s
Out[34]:
'\n- RMSE = 0.333\n- R2 = 0.915\n'

1.1.2 - Select most important features (method : cumulative feature importance selection)

In [35]:
rfr_en_model = rfr_en_data['model']
rfr_en_coefs = rfr_en_model.feature_importances_

rfr_en_features_coefs_df = get_features_importance(training_features,
                                                   rfr_en_coefs,
                                                   abs_coefs=True,
                                                   non_zero_coefs=False,
                                                   verbose=False)

plot_cumulative_features_importance(rfr_en_features_coefs_df, threshold=0.90, plot_size=(12, 6))
In [36]:
rfr_en_mif_7_data, rfr_en_mif_7_labels = select_most_important_features(rfr_en_features_coefs_df,
                                                                        n=7,
                                                                        method='cumsum')
# Visualize n most important features
plot_n_top_features(rfr_en_mif_7_data,
                    model_name(rfr_en_model),
                    n=7,
                    plot_size=(12, 4))

1.1.3 - Second training cycle (with cumulative feature importance selection method)

In [37]:
# Second training cycle with features reduced (173 -> 7)

rfr_en_model_c2 = RandomForestRegressor(random_state=42)

rfr_en_params = {'n_estimators': [100],
#                'max_depth': [40],
#                'max_features': ['auto', 'sqrt', 'log2']
                 'min_samples_leaf': [1]}

rfr_en_reduced_data_7f, train_and_test_data_reduced_7f  = run_training_cycle(rfr_en_mif_7_labels,
                                                                               rfr_en_params,
                                                                               rfr_en_model_c2) 
RandomForestRegressor
--------------------------
Testing set performances :
- RMSE = 0.371
- R2 = 0.894
--------------------------------
Training set performances (CV) :
- RMSE = 0.394
- R2 = 0.867

1.1.4 - Display random decision tree (from cumulative feature importance selection method)

In [38]:
# Display naiv decision tree
rfr_en_naiv_model = RandomForestRegressor(n_estimators=100, random_state=42, max_depth=3)
rfr_en_naiv_model.fit(*train_and_test_data_reduced_7f['train'])

display_decision_tree(rfr_en_naiv_model,
                      rfr_en_mif_7_labels,
                      targets_cols[0],
                      tree_nb=5)
Out[38]:

1.1.5 - Select most important features (method : threshold feature importance selection)

In [39]:
rfr_en_mif_n_data, rfr_en_mif_n_labels = select_most_important_features(rfr_en_features_coefs_df,
                                                                        method='threshold',
                                                                        model=rfr_en_model,
                                                                        thr='q',
                                                                        q_value=0.75, # Q3
                                                                        v=True)
32/128 selected features, reduction of 25.0%

1.1.6 - Second training cycle (with threshold feature selection method)

In [40]:
# Second training cycle with features reduced (173 -> 32)

rfr_en_model_c2 = RandomForestRegressor(random_state=42)

rfr_en_reduced_data_32f, train_and_test_data_reduced_32f  = run_training_cycle(rfr_en_mif_n_labels,
                                                                               rfr_en_params,
                                                                               rfr_en_model_c2)
RandomForestRegressor
--------------------------
Testing set performances :
- RMSE = 0.337
- R2 = 0.913
--------------------------------
Training set performances (CV) :
- RMSE = 0.343
- R2 = 0.899

1.1.7 - Display random decision tree (from threshold feature selection method)

In [41]:
# Display naiv decision tree

rfr_en_naiv_model = RandomForestRegressor(n_estimators=100, random_state=42, max_depth=3)
rfr_en_naiv_model.fit(*train_and_test_data_reduced_32f['train'])

display_decision_tree(rfr_en_naiv_model,
                      rfr_en_mif_n_labels,
                      targets_cols[0],
                      tree_nb=5)
Out[41]:

1.1.8 - Training curve

In [42]:
# Plot training curve

x_train_std_en_7f = train_and_test_data_reduced_7f['train'][0]

plot_validation_curve(RandomForestRegressor(n_estimators=100, random_state=42),
                      x_train_std_en_7f,
                      y_train_energy,
                      'n_estimators',
                      np.arange(200, 500, 200),
                      log_scale=False,
                      scorer='neg_root_mean_squared_error')

1.1.9 - Learning curve

In [43]:
# Plot learning curve

plot_learning_curve(RandomForestRegressor(n_estimators=100, min_samples_leaf=1, random_state=42),
                    x_train_std_en_7f,
                    y_train_energy,
                    train_sizes_ratio=np.linspace(0.1, 1, 10),
                    scorer='neg_root_mean_squared_error')

# Potential for improvement : Yes
In [44]:
rfr_en_reduced_data_7f['learning_potential'] = 'Yes'

1.2 - Random Forest (target : 'TotalGHGEmissions' )

1.2.1 - Train model (optimize hyperparameters)

In [45]:
%%time 

# Random Forest

rfr_model = RandomForestRegressor(random_state=42) #,n_jobs=-1)

#    n_estimators range      |       max_depth range
#  np.arange(200, 550, 50)   |      [2, 4, 6, 8, 10]
# [400, 500, 600]

rfr_em_params = {'n_estimators': [200], # 350 3min
#                'max_depth': [40],
#                'max_features': ['auto', 'sqrt', 'log2']
                 'min_samples_leaf': [1]}

rfr_em_data = train_gridsearch(emissions_data,
                               rfr_model,
                               rfr_em_params,
                               k=10)

print(rfr_em_data['model'])
RandomForestRegressor
--------------------------
Testing set performances :
- RMSE = 0.589
- R2 = 0.849
--------------------------------
Training set performances (CV) :
- RMSE = 0.56
- R2 = 0.845
RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=None, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=200, n_jobs=None, oob_score=False,
                      random_state=42, verbose=0, warm_start=False)
CPU times: user 58.9 s, sys: 266 ms, total: 59.2 s
Wall time: 59.3 s

1.2.2 - Select most important features (method : cumulative feature importance selection)

In [46]:
rfr_em_model = rfr_em_data['model']
rfr_em_coefs = rfr_em_model.feature_importances_

rfr_em_features_coefs_df = get_features_importance(training_features,
                                                   rfr_em_coefs,
                                                   abs_coefs=True,
                                                   non_zero_coefs=False,
                                                   verbose=False)

plot_cumulative_features_importance(rfr_em_features_coefs_df, threshold=0.90, plot_size=(12, 6))
In [47]:
rfr_em_mif_8_data, rfr_em_mif_8_labels = select_most_important_features(rfr_em_features_coefs_df,
                                                                        n=8,
                                                                        method='cumsum')
# Visualize n most important features
plot_n_top_features(rfr_em_mif_8_data,
                    model_name(rfr_em_model),
                    n=8,
                    plot_size=(12, 4))

1.2.3 - Second training cycle (with cumulative feature importance selection method)

In [48]:
# Second training cycle with features reduced (173 -> 8)

rfr_em_model_c2 = RandomForestRegressor(random_state=42, n_jobs=-1)

rfr_em_reduced_data_8f, train_and_test_data_reduced_8f  = run_training_cycle(rfr_em_mif_8_labels,
                                                                             rfr_em_params,
                                                                             rfr_em_model_c2) 
RandomForestRegressor
--------------------------
Testing set performances :
- RMSE = 0.365
- R2 = 0.897
--------------------------------
Training set performances (CV) :
- RMSE = 0.382
- R2 = 0.874

1.2.4 - Display random decision tree (from cumulative feature importance selection method)

In [49]:
# Display naiv decision tree
rfr_em_naiv_model = RandomForestRegressor(n_estimators=200, random_state=42, max_depth=3)
rfr_em_naiv_model.fit(*train_and_test_data_reduced_8f['train'])

display_decision_tree(rfr_em_naiv_model,
                      rfr_em_mif_8_labels,
                      targets_cols[1],
                      tree_nb=5)
Out[49]:

1.2.5 - Select most important features (method : threshold feature importance selection)

In [50]:
rfr_em_mif_n_data, rfr_em_mif_n_labels = select_most_important_features(rfr_em_features_coefs_df,
                                                                        method='threshold',
                                                                        model=rfr_em_model,
                                                                        thr='q',
                                                                        q_value=0.75, # Q3
                                                                        v=True)
32/128 selected features, reduction of 25.0%

1.2.6 - Second training cycle (with threshold feature selection method)

In [51]:
# Second training cycle with features reduced (173 -> 32)

rfr_em_model_c2 = RandomForestRegressor(random_state=42)

rfr_em_reduced_data_32f, train_and_test_data_reduced_32f = run_training_cycle(rfr_em_mif_n_labels,
                                                                              rfr_em_params,
                                                                              rfr_em_model_c2)
RandomForestRegressor
--------------------------
Testing set performances :
- RMSE = 0.336
- R2 = 0.913
--------------------------------
Training set performances (CV) :
- RMSE = 0.346
- R2 = 0.897

1.2.7 - Display random decision tree (from threshold feature selection method)

In [52]:
# Display naiv decision tree

rfr_em_naiv_model = RandomForestRegressor(n_estimators=200, random_state=42, max_depth=3)
rfr_em_naiv_model.fit(*train_and_test_data_reduced_32f['train'])

display_decision_tree(rfr_em_naiv_model,
                      rfr_em_mif_n_labels,
                      targets_cols[1],
                      tree_nb=5)
Out[52]:

1.2.8 - Training curve

In [53]:
# Plot training curve

x_train_std_em_8f = train_and_test_data_reduced_8f['train'][0]

plot_validation_curve(RandomForestRegressor(n_estimators=200, random_state=42),
                      x_train_std_em_8f,
                      y_train_energy,
                      'n_estimators',
                      np.arange(200, 500, 200),
                      log_scale=False,
                      scorer='neg_root_mean_squared_error')

1.2.9 - Learning curve

In [54]:
# Plot learning curve

plot_learning_curve(RandomForestRegressor(n_estimators=200, random_state=42),
                    x_train_std_em_8f,
                    y_train_energy,
                    train_sizes_ratio=np.linspace(0.1, 1, 10),
                    scorer='neg_root_mean_squared_error')

# Potential for improvement : Yes
In [55]:
rfr_em_reduced_data_8f['learning_potential'] = 'Yes'

2.1 - XGBoost (target : 'SiteEnergyUse(kBtu)' )

2.1.1 - Train model (optimize hyperparameters)

In [56]:
%%time

# XGBoost
xgb_reg = xgb.XGBRegressor(objective='reg:squarederror')

# xgb_reg_params = {'n_estimators': [150, 200, 250],
#                   'alpha': [0.5, 2, 5],
#                   'learning_rate': np.arange(0.1, 1.1, 0.1),
#                   'max_depth': [5, 10, 20],
#                   'colsample_bytree': [0.3],
#                   #'subsample': np.arange(0.1, 1.1, 0.1)
#                  }

xgb_reg_en_params = {'n_estimators': [500],
                     'alpha': [0.5], # selected from np.arange(1, 2, 0.1),
                     'learning_rate': [0.062], # 0.065
                     'max_depth': [20],
                     'colsample_bytree': [0.35]
                     }

xgb_en_data = train_gridsearch(energy_data,
                               xgb_reg,
                               xgb_reg_en_params)

"""
- RMSE = 0.276
- R2 = 0.942
"""

print(xgb_en_data['model'])
XGBRegressor
--------------------------
Testing set performances :
- RMSE = 0.276
- R2 = 0.942
--------------------------------
Training set performances (CV) :
- RMSE = 0.268
- R2 = 0.938
XGBRegressor(alpha=0.5, base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=0.35, gamma=0, gpu_id=-1,
             importance_type='gain', interaction_constraints='',
             learning_rate=0.062, max_delta_step=0, max_depth=20,
             min_child_weight=1, missing=nan, monotone_constraints='()',
             n_estimators=500, n_jobs=0, num_parallel_tree=1,
             objective='reg:squarederror', random_state=0, reg_alpha=0.5,
             reg_lambda=1, scale_pos_weight=1, subsample=1, tree_method='exact',
             validate_parameters=1, verbosity=None)
CPU times: user 3min 25s, sys: 1.59 s, total: 3min 26s
Wall time: 58 s

2.1.2 - Select most important features (method : cumulative feature importance selection)

In [57]:
xgb_reg_en_model = xgb_en_data['model']
xgb_reg_en_coefs = xgb_reg_en_model.feature_importances_

xgb_reg_en_features_coefs_df = get_features_importance(training_features,
                                                       xgb_reg_en_coefs,
                                                       abs_coefs=True,
                                                       non_zero_coefs=False,
                                                       verbose=False)

plot_cumulative_features_importance(xgb_reg_en_features_coefs_df, threshold=0.90, plot_size=(12, 6))
In [58]:
xgb_reg_en_mif_29_data, xgb_reg_en_mif_29_labels = select_most_important_features(xgb_reg_en_features_coefs_df,
                                                                                  n=29,
                                                                                  method='cumsum')
# Visualize n most important features
plot_n_top_features(xgb_reg_en_mif_29_data,
                    model_name(xgb_reg_en_model),
                    n=10,
                    plot_size=(12, 4))

2.1.3 - Second training cycle (with cumulative feature importance selection method)

In [59]:
%%time 

# Second training cycle with features reduced (173 -> 29)

xgb_reg_en_model_c2 = xgb.XGBRegressor(objective='reg:squarederror')


xgb_reg_en_reduced_data_29f, train_and_test_data_reduced_29f  = run_training_cycle(xgb_reg_en_mif_29_labels,
                                                                                   xgb_reg_en_params,
                                                                                   xgb_reg_en_model_c2) 
XGBRegressor
--------------------------
Testing set performances :
- RMSE = 0.285
- R2 = 0.938
--------------------------------
Training set performances (CV) :
- RMSE = 0.291
- R2 = 0.927
CPU times: user 41.1 s, sys: 586 ms, total: 41.7 s
Wall time: 11.9 s

2.1.4 - Display random decision tree (from cumulative feature importance selection method)

In [60]:
# Display naiv decision tree
xgb.plot_tree(xgb_reg_en_reduced_data_29f['model'], num_trees=5)
plt.show()

2.1.5 - Select most important features (method : threshold feature importance selection)

In [61]:
xgb_reg_en_mif_n_data, xgb_reg_en_mif_n_labels = select_most_important_features(xgb_reg_en_features_coefs_df,
                                                                                method='threshold',
                                                                                model=xgb_reg_en_model,
                                                                                thr='q',
                                                                                q_value=0.75, # Q3
                                                                                v=True)
32/128 selected features, reduction of 25.0%

2.1.6 - Second training cycle (with threshold feature selection method)

In [62]:
# Second training cycle with features reduced (173 -> 32)

xgb_reg_en_model_c2 = xgb.XGBRegressor(objective='reg:squarederror')

xgb_reg_en_reduced_data_32f, train_and_test_data_reduced_32f  = run_training_cycle(xgb_reg_en_mif_n_labels,
                                                                                   xgb_reg_en_params,
                                                                                   xgb_reg_en_model_c2) 
XGBRegressor
--------------------------
Testing set performances :
- RMSE = 0.283
- R2 = 0.938
--------------------------------
Training set performances (CV) :
- RMSE = 0.291
- R2 = 0.927

2.1.7 - Display random decision tree (from threshold feature selection method)

In [63]:
# Display naiv decision tree
xgb.plot_tree(xgb_reg_en_reduced_data_32f['model'], num_trees=5)
plt.show()

2.1.8 - Training curve

In [64]:
# Plot training curve

x_train_std_en_29f = train_and_test_data_reduced_29f['train'][0]


xgb_reg_en_params = {'objective': 'reg:squarederror',
                     'n_estimators': 500,
                     'alpha': 0.5,
                     'learning_rate': 0.062,
                     'max_depth': 20,
                     'colsample_bytree': 0.35}


plot_validation_curve(xgb.XGBRegressor(**xgb_reg_en_params),
                      x_train_std_en_29f,
                      y_train_energy,
                      'n_estimators',
                      np.arange(200, 800, 100),
                      log_scale=False,
                      scorer='neg_root_mean_squared_error')

2.1.9 - Learning curve

In [65]:
# Plot learning curve

plot_learning_curve(xgb.XGBRegressor(**xgb_reg_en_params),
                    x_train_std_en_29f,
                    y_train_energy,
                    train_sizes_ratio=np.linspace(0.1, 1, 10),
                    scorer='neg_root_mean_squared_error')

# Potential for improvement : Yes
In [66]:
xgb_reg_en_reduced_data_29f['learning_potential'] = 'Yes'

2.2 - XGBoost (target : 'TotalGHGEmissions' )

2.2.1 - Train model (optimize hyperparameters)

In [67]:
%%time

# XGBoost
xgb_reg = xgb.XGBRegressor(objective='reg:squarederror')

# xgb_reg_params = {'n_estimators': [150, 200, 250, 300],
#                   'alpha': [2, 5],
#                   'learning_rate': np.arange(0.1, 1.1, 0.1),
#                   'max_depth': [5, 10, 20],
#                   'colsample_bytree': [0.3],
#                   'subsample': np.arange(0.1, 1.1, 0.1)}

xgb_reg_em_params = {'n_estimators': [500],
                     'alpha': [1.6], # selected from np.arange(1, 2, 0.1),
                     'learning_rate': [0.05],
                     'max_depth': [20],
                     'colsample_bytree': [0.35]
                    }


xgb_em_data = train_gridsearch(emissions_data,
                               xgb_reg,
                               xgb_reg_em_params)


print(xgb_em_data['model'])
XGBRegressor
--------------------------
Testing set performances :
- RMSE = 0.538
- R2 = 0.875
--------------------------------
Training set performances (CV) :
- RMSE = 0.52
- R2 = 0.867
XGBRegressor(alpha=1.6, base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=0.35, gamma=0, gpu_id=-1,
             importance_type='gain', interaction_constraints='',
             learning_rate=0.05, max_delta_step=0, max_depth=20,
             min_child_weight=1, missing=nan, monotone_constraints='()',
             n_estimators=500, n_jobs=0, num_parallel_tree=1,
             objective='reg:squarederror', random_state=0, reg_alpha=1.60000002,
             reg_lambda=1, scale_pos_weight=1, subsample=1, tree_method='exact',
             validate_parameters=1, verbosity=None)
CPU times: user 3min 19s, sys: 1.06 s, total: 3min 20s
Wall time: 54.3 s

2.2.2 - Select most important features (method : cumulative feature importance selection)

In [68]:
xgb_reg_em_model = xgb_em_data['model']
xgb_reg_em_coefs = xgb_reg_em_model.feature_importances_

xgb_reg_em_features_coefs_df = get_features_importance(training_features,
                                                       xgb_reg_em_coefs,
                                                       abs_coefs=True,
                                                       non_zero_coefs=False,
                                                       verbose=False)

plot_cumulative_features_importance(xgb_reg_em_features_coefs_df, threshold=0.90, plot_size=(12, 6))
In [69]:
xgb_reg_em_mif_35_data, xgb_reg_em_mif_35_labels = select_most_important_features(xgb_reg_em_features_coefs_df,
                                                                                  n=35,
                                                                                  method='cumsum')
# Visualize n most important features
plot_n_top_features(xgb_reg_em_mif_35_data,
                    model_name(xgb_reg_em_model),
                    n=10,
                    plot_size=(12, 4))

2.2.3 - Second training cycle (with cumulative feature importance selection method)

In [70]:
%%time 

# Second training cycle with features reduced (173 -> 35)

xgb_reg_em_model_c2 = xgb.XGBRegressor(objective='reg:squarederror')


xgb_reg_em_reduced_data_35f, train_and_test_data_reduced_35f  = run_training_cycle(xgb_reg_em_mif_35_labels,
                                                                                   xgb_reg_em_params,
                                                                                   xgb_reg_em_model_c2) 
XGBRegressor
--------------------------
Testing set performances :
- RMSE = 0.298
- R2 = 0.932
--------------------------------
Training set performances (CV) :
- RMSE = 0.302
- R2 = 0.922
CPU times: user 34.4 s, sys: 189 ms, total: 34.6 s
Wall time: 9.27 s

2.2.4 - Display random decision tree (from cumulative feature importance selection method)

In [71]:
# Display naiv decision tree
xgb.plot_tree(xgb_reg_em_reduced_data_35f['model'], num_trees=5)
plt.show()

2.2.5 - Select most important features (method : threshold feature importance selection)

In [72]:
xgb_reg_em_mif_n_data, xgb_reg_em_mif_n_labels = select_most_important_features(xgb_reg_em_features_coefs_df,
                                                                                method='threshold',
                                                                                model=xgb_reg_em_model,
                                                                                thr='q',
                                                                                q_value=0.75, # Q3
                                                                                v=True)
32/128 selected features, reduction of 25.0%

2.2.6 - Second training cycle (with threshold feature selection method)

In [73]:
# Second training cycle with features reduced (173 -> 32)

xgb_reg_em_model_c2 = xgb.XGBRegressor(objective='reg:squarederror')

xgb_reg_em_reduced_data_32f, train_and_test_data_reduced_32f  = run_training_cycle(xgb_reg_em_mif_n_labels,
                                                                                   xgb_reg_em_params,
                                                                                   xgb_reg_em_model_c2) 
XGBRegressor
--------------------------
Testing set performances :
- RMSE = 0.298
- R2 = 0.932
--------------------------------
Training set performances (CV) :
- RMSE = 0.3
- R2 = 0.923

2.2.7 - Display random decision tree (from threshold feature selection method)

In [74]:
# Display naiv decision tree
xgb.plot_tree(xgb_reg_em_reduced_data_32f['model'], num_trees=5)
plt.show()

2.2.8 - Training curve

In [75]:
# Plot training curve

x_train_std_em_32f = train_and_test_data_reduced_32f['train'][0]

xgb_reg_em_params = {'objective': 'reg:squarederror',
                     'n_estimators': 500,
                     'alpha': 1.6,
                     'learning_rate': 0.05,
                     'max_depth': 20,
                     'colsample_bytree': 0.35}

plot_validation_curve(xgb.XGBRegressor(**xgb_reg_em_params),
                      x_train_std_em_32f,
                      y_train_emissions,
                      'n_estimators',
                      np.arange(200, 800, 200),
                      log_scale=False,
                      scorer='neg_root_mean_squared_error')

2.2.9 - Learning curve

In [76]:
# Plot learning curve

plot_learning_curve(xgb.XGBRegressor(**xgb_reg_em_params),
                    x_train_std_em_32f,
                    y_train_emissions,
                    train_sizes_ratio=np.linspace(0.1, 1, 10),
                    scorer='neg_root_mean_squared_error')

# Potential for improvement : Yes
In [77]:
xgb_reg_em_reduced_data_32f['learning_potential'] = 'Yes'

3 - Feature Selection : union of the variables selected by the models

In [78]:
# Get union from best models selected features for each target


# Energy best selected features


# Merge best selected features from models
en_total_best_features = enet_en_mif_29_labels + rfr_en_mif_7_labels + xgb_reg_en_mif_n_labels
# Extract best unique selected features from merged list
en_best_features = list(set(en_total_best_features))


# Emissions best selected features


# Merge best selected features from models
em_total_best_features = enet_em_mif_n_labels + rfr_em_mif_8_labels + xgb_reg_em_mif_n_labels
# Extract best unique selected features from merged list
em_best_features = list(set(em_total_best_features))

# Display best features count for each target
print('Energy target has {} best features'.format(len(en_best_features)))
print('Emissions target has {} best features'.format(len(em_best_features)))
Energy target has 39 best features
Emissions target has 41 best features

4 - Non linear models : Kernel SVR, MLPR

N.B :

Nonlinear models do not allow to measure in a simple way feature contributions from models

We will therefore proceed here differently by:

  • training our models from all the features.
  • carrying out a second training cycle based on the most relevant features selected by the best model (for each target)

1.1 Kernel SVR (target : 'SiteEnergyUse(kBtu)' )

1.1.1 - Train model (optimize hyperparameters)

In [157]:
%%time

kernel_svr_model = SVR(kernel='rbf')

kernel_svr_en_params = {'C': [50], # 250 # np.arange(50, 550, 50),  np.logspace(-3, 3, 10)
                        'gamma': [0.001],
                        'epsilon': [0.1]}

kernel_svr_en_data = train_gridsearch(energy_data,
                                      kernel_svr_model,
                                      kernel_svr_en_params)

kernel_svr_en_model = kernel_svr_en_data['model']
kernel_svr_en_model
SVR
--------------------------
Testing set performances :
- RMSE = 0.347
- R2 = 0.907
--------------------------------
Training set performances (CV) :
- RMSE = 0.35
- R2 = 0.893
CPU times: user 32.3 s, sys: 56 ms, total: 32.4 s
Wall time: 32.6 s
Out[157]:
SVR(C=50, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma=0.001,
    kernel='rbf', max_iter=-1, shrinking=True, tol=0.001, verbose=False)

1.1.2 - Second training cycle (with feature selection)

In [158]:
%%time 

# Second training cycle with features reduced

kernel_svr_en_model = SVR(kernel='rbf')

# We test the different sets of selected features for each model by also adding the union of these
en_selected_feature_sets_dict = {'ElasticNet': enet_en_mif_29_labels,
                                 'RandomForestRegressor': rfr_en_mif_7_labels,
                                 'XGBoostRegressor': xgb_reg_en_mif_n_labels,
                                 'Union of features selected by each model': en_best_features}


def run_training_cycle_for_each_feature_set(feature_sets, model, model_param_grid):
    # Second cycle results dictionary 
    second_cycle_results = {'rmse': [],
                            'r2': [],
                            'time': [],
                            'total selected features': [],
                            'selected features by': []}
    # Run second training cycle for each selected features lists
    for model_of_feature_set, selected_features in feature_sets.items():
        
        model_reduced_data_nf, train_and_test_data_reduced_nf = run_training_cycle(selected_features,
                                                                                   model_param_grid,
                                                                                   model,
                                                                                   v=False)
        second_cycle_results['rmse'].append(model_reduced_data_nf['rmse'])
        second_cycle_results['r2'].append(model_reduced_data_nf['r2'])
        second_cycle_results['time'].append(model_reduced_data_nf['time'])
        second_cycle_results['total selected features'].append(len(selected_features))
        second_cycle_results['selected features by'].append(model_of_feature_set)
    # Build second cycle results
    second_cycle_results_df = pd.DataFrame(second_cycle_results)
    return second_cycle_results_df


kernel_svr_en_second_cycle_results_df = run_training_cycle_for_each_feature_set(en_selected_feature_sets_dict,
                                                                                kernel_svr_en_model,
                                                                                kernel_svr_en_params)
# Best performances for each feature set
kernel_svr_en_second_cycle_results_df
CPU times: user 21.8 s, sys: 41.1 ms, total: 21.9 s
Wall time: 21.9 s
Out[158]:
rmse r2 time total selected features selected features by
0 0.346 0.908 6 s 29 ElasticNet
1 0.466 0.833 3 s 7 RandomForestRegressor
2 0.356 0.903 6 s 32 XGBoostRegressor
3 0.345 0.908 7 s 39 Union of features selected by each model
In [82]:
# Corresponds to 10 variables from original dataset
enet_en_mif_29_labels
Out[82]:
['PropertyGFATotal',
 'ThirdLargestPropertyUseTypeGFA',
 'ENERGYSTARScore',
 'Main_energy_electricity',
 'North_south_dist',
 'BuildingType_Multifamily LR (1-4)',
 'BuildingType_Multifamily MR (5-9)',
 'BuildingType_Nonresidential COS',
 'PrimaryPropertyType_DistributionCenter',
 'PrimaryPropertyType_Hospital',
 'PrimaryPropertyType_Hotel',
 'PrimaryPropertyType_LargeOffice',
 'PrimaryPropertyType_MedicalOffice',
 'PrimaryPropertyType_Non-RefrigeratedWarehouse',
 'PrimaryPropertyType_RetailStore',
 'PrimaryPropertyType_SeniorCareCommunity',
 'PrimaryPropertyType_Small-andMid-SizedOffice',
 'PrimaryPropertyType_Supermarket/GroceryStore',
 'PrimaryPropertyType_Warehouse',
 'PrimaryPropertyType_WorshipFacility',
 'LargestPropertyUseType_Data Center',
 'LargestPropertyUseType_Other - Restaurant/Bar',
 'LargestPropertyUseType_Parking',
 'SecondLargestPropertyUseType_Data Center',
 'SecondLargestPropertyUseType_Laboratory',
 'SecondLargestPropertyUseType_Office',
 'SecondLargestPropertyUseType_Supermarket/Grocery Store',
 'Neighborhood_North',
 'Neighborhood_Southwest']
In [83]:
# Corresponds to 7 variables from original dataset
rfr_en_mif_7_labels
Out[83]:
['PropertyGFATotal',
 'ENERGYSTARScore',
 'PrimaryPropertyType_Supermarket/GroceryStore',
 'Main_energy_electricity',
 'North_south_dist',
 'East_west_dist',
 'SecondLargestPropertyUseTypeGFA']
In [154]:
# Less than 10 % per variable from difference (10 - 7) between ElasticNet and Random Forest feature selections
percentage_change(0.466, 0.346)
Out[154]:
-25.75
In [155]:
# Adding 3 variables compared to Random Forest total feature selection (7) 
# increases the total amount of variables required by approximately 43%
percentage_change(7, 10)
Out[155]:
42.86
In [159]:
# Train best Kernel SVR model with appropriate selected features list
# Best model is selected by arbitrating between feature total and score (RMSE)
# Fewer features mean less data acquisition costs
# we select Random Forest feature selection
kernel_svr_en_reduced_data_7f, train_and_test_data_reduced_7f  = run_training_cycle(rfr_en_mif_7_labels,
                                                                                    kernel_svr_en_params,
                                                                                    kernel_svr_en_model,
                                                                                    v=False)

1.1.3 - Training curve

In [160]:
x_train_std_en_7f = train_and_test_data_reduced_7f['train'][0]

kernel_svr_en_params = {'kernel': 'rbf',
                        'C': 50,
                        'gamma': 0.001,
                        'epsilon': 0.1}

plot_validation_curve(SVR(**kernel_svr_en_params),
                      x_train_std_en_7f,
                      y_train_energy,
                      'C',
                      np.arange(1, 100, 10),
                      log_scale=False,
                      scorer='neg_root_mean_squared_error')

1.1.4 - Learning curve

In [161]:
# Plot learning curve

# N.B : function from mlearn evaluator module
plot_learning_curve(SVR(**kernel_svr_en_params),
                    x_train_std_en_7f,
                    y_train_energy,
                    train_sizes_ratio=np.linspace(0.1, 1, 10),
                    scorer='neg_root_mean_squared_error')

# Potential for improvement : No
In [163]:
kernel_svr_en_reduced_data_7f['learning_potential'] = 'No'

1.2 Kernel SVR (target : 'TotalGHGEmissions' )

1.2.1 - Train model (optimize hyperparameters)

In [222]:
%%time

kernel_svr_model = SVR(kernel='rbf')

kernel_svr_em_params = {'C': [50], # 300
                        'gamma': [0.001], # 0.01
                        'epsilon': [0.1]}

kernel_svr_em_data = train_gridsearch(emissions_data,
                                      kernel_svr_model,
                                      kernel_svr_em_params)

kernel_svr_em_data['model']
CPU times: user 20 µs, sys: 2 µs, total: 22 µs
Wall time: 26.9 µs

1.2.2 - Second training cycle (with feature selection)

In [165]:
%%time 

# Second training cycle with features reduced

kernel_svr_em_model = SVR(kernel='rbf')

# We test the different sets of selected features for each model by also adding the union of these
em_selected_feature_sets_dict = {'ElasticNet': enet_em_mif_n_labels,
                                 'RandomForestRegressor': rfr_em_mif_8_labels,
                                 'XGBoostRegressor': xgb_reg_em_mif_n_labels,
                                 'Union of features selected by each model': em_best_features}

kernel_svr_em_second_cycle_results_df = run_training_cycle_for_each_feature_set(em_selected_feature_sets_dict,
                                                                                kernel_svr_em_model,
                                                                                kernel_svr_em_params)

# Best performances for each feature set
kernel_svr_em_second_cycle_results_df
CPU times: user 24.4 s, sys: 81.7 ms, total: 24.5 s
Wall time: 24.5 s
Out[165]:
rmse r2 time total selected features selected features by
0 0.348 0.907 6 s 28 ElasticNet
1 0.456 0.840 4 s 8 RandomForestRegressor
2 0.364 0.898 6 s 32 XGBoostRegressor
3 0.346 0.908 8 s 41 Union of features selected by each model
In [92]:
# Corresponds to 11 variables from original dataset
enet_em_mif_n_labels
Out[92]:
['PropertyGFATotal',
 'ENERGYSTARScore',
 'Main_energy_electricity',
 'North_south_dist',
 'OldBuilding',
 'BuildingType_Multifamily LR (1-4)',
 'BuildingType_Multifamily MR (5-9)',
 'PrimaryPropertyType_DistributionCenter',
 'PrimaryPropertyType_Hospital',
 'PrimaryPropertyType_Hotel',
 'PrimaryPropertyType_Non-RefrigeratedWarehouse',
 'PrimaryPropertyType_RetailStore',
 'PrimaryPropertyType_SeniorCareCommunity',
 'PrimaryPropertyType_Small-andMid-SizedOffice',
 'PrimaryPropertyType_Supermarket/GroceryStore',
 'PrimaryPropertyType_Warehouse',
 'PrimaryPropertyType_WorshipFacility',
 'LargestPropertyUseType_Data Center',
 'LargestPropertyUseType_Other - Restaurant/Bar',
 'SecondLargestPropertyUseType_Data Center',
 'SecondLargestPropertyUseType_Office',
 'SecondLargestPropertyUseType_Retail Store',
 'SecondLargestPropertyUseType_Supermarket/Grocery Store',
 'ThirdLargestPropertyUseType_Other - Education',
 'Neighborhood_Delridge',
 'Neighborhood_East',
 'Neighborhood_Southeast',
 'Neighborhood_Southwest']
In [93]:
# Corresponds to 8 variables from original dataset
rfr_em_mif_8_labels
Out[93]:
['PropertyGFATotal',
 'Main_energy_electricity',
 'ENERGYSTARScore',
 'North_south_dist',
 'East_west_dist',
 'PrimaryPropertyType_Supermarket/GroceryStore',
 'BuildingType_Multifamily LR (1-4)',
 'SecondLargestPropertyUseTypeGFA']
In [167]:
# Less than 8 % per variable from difference (11 - 8) between ElasticNet and Random Forest feature selections
percentage_change(0.456, 0.348)
Out[167]:
-23.68
In [166]:
# Adding 3 variables compared to Random Forest total feature selection (7) 
# increases the total amount of variables required by approximately 38%
percentage_change(8, 11)
Out[166]:
37.5
In [223]:
# Train best Kernel SVR model with appropriate selected features list
# Best model is selected by arbitrating between feature total and score (RMSE)
# Fewer features mean less data acquisition costs
# we select Random Forest feature selection
kernel_svr_em_reduced_data_8f, train_and_test_data_reduced_8f  = run_training_cycle(rfr_em_mif_n_labels,
                                                                                    kernel_svr_em_params,
                                                                                    kernel_svr_em_model,
                                                                                    v=False)

1.2.3 - Training curve

In [169]:
x_train_std_em_8f = train_and_test_data_reduced_8f['train'][0]

kernel_svr_em_params = {'kernel': 'rbf',
                        'C': 50,
                        'gamma': 0.001,
                        'epsilon': 0.1}

plot_validation_curve(SVR(**kernel_svr_em_params),
                      x_train_std_em_8f,
                      y_train_emissions,
                      'C',
                      np.arange(1, 100, 10),
                      log_scale=False,
                      scorer='neg_root_mean_squared_error')

1.2.4 - Learning curve

In [170]:
# Plot learning curve


# N.B : function from mlearn evaluator module
plot_learning_curve(SVR(**kernel_svr_em_params),
                    x_train_std_em_8f,
                    y_train_emissions,
                    train_sizes_ratio=np.linspace(0.1, 1, 10),
                    scorer='neg_root_mean_squared_error')

# Potential for improvement : No
In [224]:
kernel_svr_em_reduced_data_8f['learning_potential'] = 'No'

2.1 - Multi-Layer Perceptron Regressor (target : 'SiteEnergyUse(kBtu)' )

2.1.1 - Train model (optimize hyperparameters)

In [172]:
%%time 

# Multi-Layer Perceptron Regressor

mlpr_model = MLPRegressor(activation='relu', #'identity',
                          learning_rate='adaptive',
                          alpha=4,
                          max_iter=5000,
                          verbose=False,
                          random_state=42)

mlpr_en_params = {'hidden_layer_sizes': (16, 3), # pow(16, 3) < df.shape[0]
                  'learning_rate_init': [0.0008]} # 0.001

mlpr_en_data = train_gridsearch(energy_data,
                                mlpr_model,
                                mlpr_en_params)
mlpr_en_data['model']
MLPRegressor
--------------------------
Testing set performances :
- RMSE = 0.352
- R2 = 0.905
--------------------------------
Training set performances (CV) :
- RMSE = 0.347
- R2 = 0.895
CPU times: user 7min 37s, sys: 4min 8s, total: 11min 46s
Wall time: 4min 24s
Out[172]:
MLPRegressor(activation='relu', alpha=4, batch_size='auto', beta_1=0.9,
             beta_2=0.999, early_stopping=False, epsilon=1e-08,
             hidden_layer_sizes=16, learning_rate='adaptive',
             learning_rate_init=0.0008, max_fun=15000, max_iter=5000,
             momentum=0.9, n_iter_no_change=10, nesterovs_momentum=True,
             power_t=0.5, random_state=42, shuffle=True, solver='adam',
             tol=0.0001, validation_fraction=0.1, verbose=False,
             warm_start=False)

2.1.2 - Second training cycle (with feature selection)

In [173]:
%%time 

# Second training cycle with features reduced

mlpr_en_model = MLPRegressor(activation='relu',
                             learning_rate='adaptive',
                             alpha=4,
                             max_iter=5000,
                             verbose=False,
                             random_state=42)

# Build second cycle results
mlpr_en_second_cycle_results_df = run_training_cycle_for_each_feature_set(en_selected_feature_sets_dict,
                                                                          mlpr_en_model,
                                                                          mlpr_en_params)
# Best performances for each feature set 
mlpr_en_second_cycle_results_df
CPU times: user 3min 2s, sys: 3.37 s, total: 3min 5s
Wall time: 2min 59s
Out[173]:
rmse r2 time total selected features selected features by
0 0.355 0.903 46 s 29 ElasticNet
1 0.467 0.832 44 s 7 RandomForestRegressor
2 0.369 0.895 44 s 32 XGBoostRegressor
3 0.359 0.901 45 s 39 Union of features selected by each model
In [174]:
mlpr_en_reduced_data_7f, train_and_test_data_reduced_7f  = run_training_cycle(rfr_en_mif_7_labels,
                                                                                mlpr_en_params,
                                                                                mlpr_en_model,
                                                                                v=False)

2.1.3 - Training curve

In [175]:
x_train_std_en_7f = train_and_test_data_reduced_7f['train'][0]

mlpr_en_params = {'activation': 'relu',
                  'learning_rate': 'adaptive',
                  'alpha': 4,
                  'max_iter': 5000,
                  'hidden_layer_sizes': (16, 3),
                  'learning_rate_init': 0.0008,
                  'random_state': 42}

plot_validation_curve(MLPRegressor(**mlpr_en_params),
                      x_train_std_en_7f,
                      y_train_energy,
                      'learning_rate_init',
                      np.arange(0.0001, 0.001, 0.0001),
                      log_scale=False,
                      scorer='neg_root_mean_squared_error')

# 0.0002 seems to be a better learning rate 
In [178]:
# Run another training cycle with a learning rate = 0.0002

mlpr_en_model = MLPRegressor(activation='relu',
                             learning_rate='adaptive',
                             alpha=4,
                             max_iter=10000,
                             verbose=False,
                             random_state=42)

mlpr_en_params = {'hidden_layer_sizes': (16, 3),
                  'learning_rate_init': [0.0001]} 

mlpr_en_reduced_data_7f, train_and_test_data_reduced_7f  = run_training_cycle(rfr_en_mif_7_labels,
                                                                              mlpr_en_params,
                                                                              mlpr_en_model,
                                                                              v=True)
MLPRegressor
--------------------------
Testing set performances :
- RMSE = 0.466
- R2 = 0.833
--------------------------------
Training set performances (CV) :
- RMSE = 0.459
- R2 = 0.819

2.1.4 - Learning curve

In [179]:
# Plot learning curve

x_train_std_en_7f = train_and_test_data_reduced_7f['train'][0]

mlpr_en_params = {'activation': 'relu',
                  'learning_rate': 'adaptive',
                  'alpha': 4,
                  'max_iter': 50000,
                  'hidden_layer_sizes': (16, 3),
                  'learning_rate_init': 0.0001,
                  'random_state': 42}

# N.B : function from mlearn evaluator module
plot_learning_curve(MLPRegressor(**mlpr_en_params),
                    x_train_std_en_7f,
                    y_train_energy,
                    train_sizes_ratio=np.linspace(0.1, 1, 10),
                    scorer='neg_root_mean_squared_error')

# Potential for improvement : No
In [215]:
mlpr_en_reduced_data_7f['learning_potential'] = 'No'

2.2 - Multi-Layer Perceptron Regressor (target : 'TotalGHGEmissions' )

2.2.1 - Train model (optimize hyperparameters)

In [180]:
%%time 

mlpr_model = MLPRegressor(activation='relu', #'identity',
                          learning_rate='adaptive',
                          alpha=4,
                          max_iter=5000,
                          verbose=False,
                          random_state=42)

mlpr_em_params = {'hidden_layer_sizes': (16, 3), # pow(16, 3) < df.shape[0]
                  'learning_rate_init': [0.00075]} # 0.001

mlpr_em_data = train_gridsearch(emissions_data,
                                mlpr_model,
                                mlpr_em_params)

mlpr_em_data['model']
MLPRegressor
--------------------------
Testing set performances :
- RMSE = 0.669
- R2 = 0.806
--------------------------------
Training set performances (CV) :
- RMSE = 0.646
- R2 = 0.793
CPU times: user 3min 11s, sys: 1min 46s, total: 4min 57s
Wall time: 1min 39s
Out[180]:
MLPRegressor(activation='relu', alpha=4, batch_size='auto', beta_1=0.9,
             beta_2=0.999, early_stopping=False, epsilon=1e-08,
             hidden_layer_sizes=16, learning_rate='adaptive',
             learning_rate_init=0.00075, max_fun=15000, max_iter=5000,
             momentum=0.9, n_iter_no_change=10, nesterovs_momentum=True,
             power_t=0.5, random_state=42, shuffle=True, solver='adam',
             tol=0.0001, validation_fraction=0.1, verbose=False,
             warm_start=False)

2.2.2 - Second training cycle (with feature selection)

In [181]:
%%time 

# Second training cycle with features reduced

mlpr_em_model = MLPRegressor(activation='relu',
                             learning_rate='adaptive',
                             alpha=4,
                             max_iter=5000,
                             verbose=False,
                             random_state=42)

# Build second cycle results
mlpr_em_second_cycle_results_df = run_training_cycle_for_each_feature_set(em_selected_feature_sets_dict,
                                                                          mlpr_em_model,
                                                                          mlpr_em_params)
mlpr_em_second_cycle_results_df
# Best performances came from en_best_features (energy selected features union)
CPU times: user 3min 17s, sys: 3.04 s, total: 3min 20s
Wall time: 3min 15s
Out[181]:
rmse r2 time total selected features selected features by
0 0.361 0.900 50 s 28 ElasticNet
1 0.463 0.835 45 s 8 RandomForestRegressor
2 0.376 0.891 50 s 32 XGBoostRegressor
3 0.363 0.899 50 s 41 Union of features selected by each model
In [182]:
mlpr_em_reduced_data_8f, train_and_test_data_reduced_8f  = run_training_cycle(rfr_em_mif_8_labels,
                                                                              mlpr_em_params,
                                                                              mlpr_em_model,
                                                                              v=False)

2.2.3 - Training curve

In [183]:
x_train_std_em_8f = train_and_test_data_reduced_8f['train'][0]

mlpr_em_params = {'activation': 'relu',
                  'learning_rate': 'adaptive',
                  'alpha': 4,
                  'max_iter': 50000,
                  'hidden_layer_sizes': (16, 3),
                  'learning_rate_init': 0.00075,
                  'random_state': 42}

plot_validation_curve(MLPRegressor(**mlpr_em_params),
                      x_train_std_em_8f,
                      y_train_emissions,
                      'learning_rate_init',
                      np.arange(0.0001, 0.001, 0.0001),
                      log_scale=False,
                      scorer='neg_root_mean_squared_error')
In [112]:
# 0.0003 seems to be a better value for learning rate 

mlpr_em_model = MLPRegressor(activation='relu',
                             learning_rate='adaptive',
                             alpha=4,
                             max_iter=5000,
                             verbose=False,
                             random_state=42)

mlpr_em_params = {'hidden_layer_sizes': (16, 3),
                  'learning_rate_init': [0.0003]}

mlpr_em_reduced_data_28f, train_and_test_data_reduced_28f  = run_training_cycle(enet_em_mif_n_labels,
                                                                                mlpr_em_params,
                                                                                mlpr_em_model,
                                                                                v=True)
MLPRegressor
--------------------------
Testing set performances :
- RMSE = 0.359
- R2 = 0.901
--------------------------------
Training set performances (CV) :
- RMSE = 0.365
- R2 = 0.885

2.2.4 - Learning curve

In [184]:
# Plot learning curve

x_train_std_em_8f = train_and_test_data_reduced_8f['train'][0]

mlpr_em_params = {'activation': 'relu',
                  'learning_rate': 'adaptive',
                  'alpha': 4,
                  'max_iter': 50000,
                  'hidden_layer_sizes': (16, 3),
                  'learning_rate_init': 0.0003,
                  'random_state': 42}

# N.B : function from mlearn evaluator module
plot_learning_curve(MLPRegressor(**mlpr_em_params),
                    x_train_std_em_8f,
                    y_train_emissions,
                    train_sizes_ratio=np.linspace(0.1, 1, 10),
                    scorer='neg_root_mean_squared_error')

# Potential for improvement : Yes
In [216]:
mlpr_em_reduced_data_8f['learning_potential'] = 'Yes'

5 - Third training cycle with models union features : RandomForest, XGBoost

1.1 - Random forest (target : 'SiteEnergyUse(kBtu)' )

1.1.1 - Third training cycle (with feature selection)

In [187]:
%%time 

# Third training cycle with features reduced

rfr_en_model = RandomForestRegressor(random_state=42)

rfr_en_params = {'n_estimators': [100],
                 'min_samples_leaf': [1]}

rfr_en_third_cycle_results_df = run_training_cycle_for_each_feature_set(en_selected_feature_sets_dict,
                                                                        rfr_en_model,
                                                                        rfr_en_params)
rfr_en_third_cycle_results_df
CPU times: user 35 s, sys: 118 ms, total: 35.1 s
Wall time: 35.1 s
Out[187]:
rmse r2 time total selected features selected features by
0 0.339 0.912 8 s 29 ElasticNet
1 0.371 0.894 7 s 7 RandomForestRegressor
2 0.333 0.915 10 s 32 XGBoostRegressor
3 0.333 0.915 10 s 39 Union of features selected by each model

1.2 - Random forest (target : 'TotalGHGEmissions')

1.2.1 - Third training cycle (with feature selection)

In [190]:
%%time 

# Third training cycle with features reduced

rfr_em_model = RandomForestRegressor(random_state=42)

rfr_em_params = {'n_estimators': [200],
                 'min_samples_leaf': [1]}

rfr_em_third_cycle_results_df = run_training_cycle_for_each_feature_set(em_selected_feature_sets_dict,
                                                                        rfr_em_model,
                                                                        rfr_em_params)
rfr_em_third_cycle_results_df
CPU times: user 1min 9s, sys: 304 ms, total: 1min 9s
Wall time: 1min 9s
Out[190]:
rmse r2 time total selected features selected features by
0 0.345 0.909 15 s 28 ElasticNet
1 0.365 0.897 14 s 8 RandomForestRegressor
2 0.337 0.913 19 s 32 XGBoostRegressor
3 0.336 0.913 21 s 41 Union of features selected by each model

2.1 - XGBoost (target : 'SiteEnergyUse(kBtu)' )

2.1.1 - Third training cycle (with feature selection)

In [191]:
%%time 

# Third training cycle with features reduced

xgb_reg_en_model = xgb.XGBRegressor(objective='reg:squarederror')

xgb_reg_en_params = {'n_estimators': [500],
                     'alpha': [0.5],
                     'learning_rate': [0.062],
                     'max_depth': [20],
                     'colsample_bytree': [0.35]
                     }

xgb_reg_en_third_cycle_results_df = run_training_cycle_for_each_feature_set(en_selected_feature_sets_dict,
                                                                            xgb_reg_en_model,
                                                                            xgb_reg_en_params)
xgb_reg_en_third_cycle_results_df
CPU times: user 2min 14s, sys: 1.01 s, total: 2min 15s
Wall time: 36.9 s
Out[191]:
rmse r2 time total selected features selected features by
0 0.288 0.936 10 s 29 ElasticNet
1 0.359 0.901 6 s 7 RandomForestRegressor
2 0.283 0.938 10 s 32 XGBoostRegressor
3 0.292 0.935 11 s 39 Union of features selected by each model
In [192]:
xgb_reg_en_model = xgb.XGBRegressor(objective='reg:squarederror')

xgb_reg_en_reduced_data_7f, train_and_test_data_reduced_7f  = run_training_cycle(rfr_en_mif_7_labels,
                                                                                 xgb_reg_en_params,
                                                                                 xgb_reg_en_model,
                                                                                 v=False)

2.1.2 - Training curve

In [193]:
# Plot training curve

x_train_std_en_7f = train_and_test_data_reduced_7f['train'][0]


xgb_reg_en_params = {'objective': 'reg:squarederror',
                     'n_estimators': 500,
                     'alpha': 0.5,
                     'learning_rate': 0.062,
                     'max_depth': 20,
                     'colsample_bytree': 0.35}


plot_validation_curve(xgb.XGBRegressor(**xgb_reg_en_params),
                      x_train_std_en_7f,
                      y_train_energy,
                      'n_estimators',
                      np.arange(200, 800, 100),
                      log_scale=False,
                      scorer='neg_root_mean_squared_error')

2.1.3 - Learning curve

In [194]:
# Plot learning curve

plot_learning_curve(xgb.XGBRegressor(**xgb_reg_en_params),
                    x_train_std_en_7f,
                    y_train_energy,
                    train_sizes_ratio=np.linspace(0.1, 1, 10),
                    scorer='neg_root_mean_squared_error')

# Potential for improvement : Yes
In [195]:
xgb_reg_en_reduced_data_7f['learning_potential'] = 'Yes'

2.2 - XGBoost (target : 'TotalGHGEmissions' )

2.2.1 - Third training cycle (with feature selection)

In [196]:
%%time 

# Third training cycle with features reduced

xgb_reg_em_model = xgb.XGBRegressor(objective='reg:squarederror')

xgb_reg_em_params = {'n_estimators': [500],
                     'alpha': [1.6], 
                     'learning_rate': [0.05],
                     'max_depth': [20],
                     'colsample_bytree': [0.35]
                    }

xgb_reg_em_third_cycle_results_df = run_training_cycle_for_each_feature_set(em_selected_feature_sets_dict,
                                                                            xgb_reg_em_model,
                                                                            xgb_reg_em_params)
xgb_reg_em_third_cycle_results_df
CPU times: user 2min 7s, sys: 1.42 s, total: 2min 8s
Wall time: 35.5 s
Out[196]:
rmse r2 time total selected features selected features by
0 0.313 0.925 8 s 28 ElasticNet
1 0.374 0.892 6 s 8 RandomForestRegressor
2 0.298 0.932 9 s 32 XGBoostRegressor
3 0.294 0.934 12 s 41 Union of features selected by each model
In [197]:
xgb_reg_em_model = xgb.XGBRegressor(objective='reg:squarederror')

xgb_reg_em_reduced_data_8f, train_and_test_data_reduced_8f  = run_training_cycle(rfr_em_mif_8_labels,
                                                                                   xgb_reg_em_params,
                                                                                   xgb_reg_em_model,
                                                                                   v=False)

2.2.2 - Training curve

In [198]:
# Plot training curve

x_train_std_em_8f = train_and_test_data_reduced_8f['train'][0]


xgb_reg_em_params = {'objective': 'reg:squarederror',
                     'n_estimators': 500,
                     'alpha': 1.6,
                     'learning_rate': 0.05,
                     'max_depth': 20,
                     'colsample_bytree': 0.35}


plot_validation_curve(xgb.XGBRegressor(**xgb_reg_em_params),
                      x_train_std_em_8f,
                      y_train_emissions,
                      'n_estimators',
                      np.arange(200, 800, 100),
                      log_scale=False,
                      scorer='neg_root_mean_squared_error')

2.2.3 - Learning curve

In [199]:
# Plot learning curve

plot_learning_curve(xgb.XGBRegressor(**xgb_reg_em_params),
                    x_train_std_em_8f,
                    y_train_emissions,
                    train_sizes_ratio=np.linspace(0.1, 1, 10),
                    scorer='neg_root_mean_squared_error')

# Potential for improvement : Yes
In [200]:
xgb_reg_em_reduced_data_8f['learning_potential'] = 'Yes'

IV - Results

1 - Training results

In [225]:
# Best models for energy target (model data dictionaries from train_gridsearch wrapper)
best_en_models = [enet_en_reduced_data_29f,
                  rfr_en_reduced_data_7f,
                  xgb_reg_en_reduced_data_7f,
                  kernel_svr_en_reduced_data_7f,
                  mlpr_en_reduced_data_7f]

# Best models for emissions target (model data dictionaries from train_gridsearch wrapper)
best_em_models = [enet_em_reduced_data_28f,
                  rfr_em_reduced_data_8f,
                  xgb_reg_em_reduced_data_8f,
                  kernel_svr_em_reduced_data_8f,
                  mlpr_em_reduced_data_8f]

# Build main evaluation variables list from a random model data dictionary
model_data_keys = dict(list(rfr_en_reduced_data_7f.items())[2:])
# --> ['model_name', 'rmse', 'r2', 'time', 'n_features', 'learning_potential']

# Build main evaluation variables for result dataframes
training_results_keys = ['Model', 'RMSE', 'R2', 'Run time', 'Selected features', 'Learning potential']

# Build result dictionaries for each target
energy_training_results = {k: [] for k in training_results_keys}
emissions_training_results = {k: [] for k in training_results_keys}

# Fill result dictionaries with model data for each target
for en_model, em_model in zip(best_en_models, best_em_models):
    for k1, k2 in zip(training_results_keys, model_data_keys):
        energy_training_results[k1].append(en_model[k2])
        emissions_training_results[k1].append(em_model[k2])


target_dicts = [energy_training_results, emissions_training_results]

# Build result dataframes for each target
en_results_df, em_results_df = [pd.DataFrame(target_dict) for target_dict in target_dicts]

1.1 - Training results (target : 'SiteEnergyUse(kBtu)')

In [226]:
en_results_df
# Best model XGBRegressor
Out[226]:
Model RMSE R2 Run time Selected features Learning potential
0 ElasticNetCV 0.361 0.900 29 No
1 RandomForestRegressor 0.371 0.894 7 s 7 Yes
2 XGBRegressor 0.359 0.901 9 s 7 Yes
3 SVR 0.466 0.833 4 s 7 No
4 MLPRegressor 0.466 0.833 2 min 23 s 7 No

1.2 - Training results (target : 'TotalGHGEmissions')

In [227]:
em_results_df
# Best model XGBRegressor
Out[227]:
Model RMSE R2 Run time Selected features Learning potential
0 ElasticNetCV 0.364 0.898 28 No
1 RandomForestRegressor 0.365 0.897 9 s 8 Yes
2 XGBRegressor 0.374 0.892 6 s 8 Yes
3 SVR 0.368 0.896 7 s 32 No
4 MLPRegressor 0.463 0.835 45 s 8 Yes

1.3 - Save results data

In [228]:
# Energy target ('SiteEnergyUse(kBtu)') results
en_results_data = {'en_results_df': en_results_df,
                   'en_best_model': xgb_reg_en_reduced_data_7f,
                   'en_best_features': rfr_en_mif_7_labels}

# Emissions target ('TotalGHGEmissions') results
em_results_data = {'em_results_df': em_results_df,
                   'em_best_model': xgb_reg_em_reduced_data_8f,
                   'em_best_features': rfr_em_mif_8_labels}

# Merge target results into another dictionary
results_data = {'en': en_results_data, 'em': em_results_data}

# Save it as a .pkl file
pickle_data(filename='main_results_ENERGYSTARScore',
            folder='../data/pkl',
            data=results_data,
            method='w')

2 - Data acquisition : best selected features

In [208]:
# Check if the selected features are identical from one target to another
rfr_en_mif_7_labels == rfr_em_mif_8_labels
# The selected features are different from one target to another
Out[208]:
False

2.1 - Selected features ( target : 'SiteEnergyUse(kBtu)' )

In [209]:
rfr_en_mif_7_labels
Out[209]:
['PropertyGFATotal',
 'ENERGYSTARScore',
 'PrimaryPropertyType_Supermarket/GroceryStore',
 'Main_energy_electricity',
 'North_south_dist',
 'East_west_dist',
 'SecondLargestPropertyUseTypeGFA']
In [210]:
# Plot feature importance from best model 
xgb.plot_importance(xgb_reg_en_reduced_data_7f['model'])
# Total number of variables required : 7 
Out[210]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f9a423479d0>

2.2 - Selected features ( target : 'TotalGHGEmissions' )

In [211]:
rfr_em_mif_8_labels
Out[211]:
['PropertyGFATotal',
 'Main_energy_electricity',
 'ENERGYSTARScore',
 'North_south_dist',
 'East_west_dist',
 'PrimaryPropertyType_Supermarket/GroceryStore',
 'BuildingType_Multifamily LR (1-4)',
 'SecondLargestPropertyUseTypeGFA']
In [212]:
xgb.plot_importance(xgb_reg_em_reduced_data_8f['model'])
# Total number of variables required : 8
Out[212]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f9a4055d550>